Dataset: https://archive.ics.uci.edu/ml/datasets/Plants
This dataset was extracted from the USDA plants database. It contains all plants (species and genera) in the database and the states of the USA and Canada where they occur.
The data is in transactional form: each row contains a Latin name (species or genus) followed by a list of state abbreviations.
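As a quick illustration of that format, a row such as "abelia,fl,nc" splits into the Latin name and its list of states. A minimal parsing sketch, assuming a comma-separated file named plants.data (the file name and encoding are assumptions, not part of this notebook):
# Sketch only: parse the transactional Plants file described above.
# The file name 'plants.data', latin-1 encoding, and comma separator are assumptions.
plant_states = {}
with open('plants.data', encoding='latin-1') as f:
    for line in f:
        parts = line.strip().split(',')
        name, states = parts[0], parts[1:]  # Latin name, then state abbreviations
        plant_states[name] = states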
In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
In [10]:
cols = ['Class']
for i in range(64):
    cols.append('f{}'.format(i))
In [11]:
# read .csv from provided dataset
csv_filename="data_Mar_64.txt"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,names=cols)
In [12]:
df.head()
Out[12]:
In [13]:
df.shape
Out[13]:
In [14]:
df['Class'].unique()
Out[14]:
In [15]:
len(df['Class'].unique())
Out[15]:
In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])
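The fitted encoder keeps the mapping back to the original species names, so they can be recovered later if needed, for example:
# Recover the original class names from the encoded integers (illustrative).
print(le.inverse_transform(df['Class'].values[:5]))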
In [17]:
df['Class'].unique()
Out[17]:
In [18]:
df.head()
Out[18]:
In [19]:
features = df.columns[1:]
features
Out[19]:
In [20]:
X = df[features]
y = df['Class']
In [21]:
X.head()
Out[21]:
In [22]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [24]:
print (X_train.shape, y_train.shape)
In [34]:
y.unique()
Out[34]:
In [35]:
len(features)
Out[35]:
In [38]:
# Apply PCA with the same number of dimensions as variables in the dataset
from sklearn.decomposition import PCA
pca = PCA(n_components=64)
pca.fit(X)
# Print the components and the amount of variance in the data contained in each dimension
print(pca.components_)
print(pca.explained_variance_ratio_)
In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()
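A common follow-up to this scree plot is the cumulative explained variance, which suggests how many components are needed to retain most of the variation; a minimal sketch using the pca object fitted above (the 90% threshold is an arbitrary choice):
# Cumulative explained variance and the number of components covering 90% of it.
import numpy as np
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_components_90 = int(np.argmax(cum_var >= 0.90)) + 1
print('Components needed for 90% of the variance:', n_components_90)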
In [45]:
# First we reduce the data to two dimensions using PCA to capture variation
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
print(reduced_data[:10])  # print the first 10 rows
In [47]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=100)
clusters = kmeans.fit(reduced_data)
print(clusters)
In [48]:
# Plot the decision boundary by building a mesh grid to populate a graph.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))
# Obtain labels for each point in mesh. Use last trained model.
Z = clusters.predict(np.c_[xx.ravel(), yy.ravel()])
In [49]:
# Find the KMeans cluster centroids
centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)
# Transform the centroids back to the original feature space
print('*** CENTROIDS TRANSFORMED TO ORIGINAL SPACE ***')
print(pca.inverse_transform(centroids))
In [50]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the plant dataset (PCA-reduced data)\n'
          'Centroids are marked with a white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
In [52]:
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=100, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
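Because the true class labels are available, the agglomerative assignment can be scored against them with the same external metrics used for the other models below; a short sketch:
# Compare the agglomerative clusters with the known classes (illustrative).
from sklearn import metrics
print('Adjusted rand score: {:.2}'.format(metrics.adjusted_rand_score(y, labels)))
print('Homogeneity score: {:.2}'.format(metrics.homogeneity_score(y, labels)))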
In [53]:
from sklearn.cross_validation import train_test_split
X = df[features]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.25, random_state=42)
In [55]:
from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=100, random_state=5)
clf.fit(X_train)
print(clf.labels_.shape)
print(clf.labels_)
In [56]:
# Predict clusters on testing data
y_pred = clf.predict(X_test)
In [57]:
from sklearn import metrics
print("Adjusted rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
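The same three scores and the confusion matrix are printed for every clustering model below, so a small helper can avoid repeating the block; a sketch (the helper name report_clustering is made up here):
# Hypothetical helper wrapping the repeated metric printing.
def report_clustering(y_true, y_pred):
    print("Adjusted rand score: {:.2}".format(metrics.adjusted_rand_score(y_true, y_pred)))
    print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_true, y_pred)))
    print("Completeness score: {:.2}".format(metrics.completeness_score(y_true, y_pred)))
    print("Confusion matrix")
    print(metrics.confusion_matrix(y_true, y_pred))

report_clustering(y_test, y_pred)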
In [58]:
# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print(aff.cluster_centers_indices_.shape)
In [59]:
y_pred = aff.predict(X_test)
In [60]:
from sklearn import metrics
print("Adjusted rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
In [61]:
ms = cluster.MeanShift()
ms.fit(X_train)
Out[61]:
In [62]:
y_pred = ms.predict(X_test)
In [63]:
from sklearn import metrics
print("Adjusted rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
In [65]:
from sklearn import mixture
# Define a held-out dataset to choose the covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)
for covariance_type in ['spherical', 'tied', 'diag', 'full']:
    gm = mixture.GMM(n_components=100, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred = gm.predict(X_test_heldout)
    print("Adjusted rand score for covariance={}: {:.2}".format(
        covariance_type, metrics.adjusted_rand_score(y_test_heldout, y_pred)))
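An alternative way to compare covariance types, without using the held-out labels, is an information criterion such as BIC, which the old GMM API exposes; a sketch under that assumption (this step is not part of the original notebook):
# Rank covariance types by BIC on the held-out data; lower BIC is better (illustrative).
for covariance_type in ['spherical', 'tied', 'diag', 'full']:
    gm = mixture.GMM(n_components=100, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    print("BIC for covariance={}: {:.4g}".format(covariance_type, gm.bic(X_test_heldout)))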
In [67]:
X = df[features].values
y= df['Class'].values
pca = PCA(n_components=2)
X = pca.fit_transform(X)
In [77]:
c = []
from matplotlib.pyplot import cm
n=100
color=iter(cm.rainbow(np.linspace(0,1,n)))
for i in range(n):
    c.append(next(color))
In [80]:
c[99]
Out[80]:
In [94]:
n = 100
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
km = KMeans(n_clusters=n, random_state=0)
y_km = km.fit_predict(X)
for i in range(n):
    ax1.scatter(X[y_km == i, 0], X[y_km == i, 1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax1.set_title('K-means clustering')
ac = AgglomerativeClustering(n_clusters=100, affinity='euclidean', linkage='complete')
y_ac = ac.fit_predict(X)
for i in range(n):
    ax2.scatter(X[y_ac == i, 0], X[y_ac == i, 1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax2.set_title('Agglomerative clustering')
# Put a legend below the current axis
plt.legend(loc='upper center', bbox_to_anchor=(0, -0.05),
           fancybox=True, shadow=True, ncol=10)
plt.tight_layout()
# plt.savefig('./figures/kmeans_and_ac.png', dpi=300)
plt.show()
In [40]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
In [25]:
X = df[features]
y = df['Class']
In [26]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [27]:
print (X_train.shape, y_train.shape)
In [28]:
t0=time()
print ("DecisionTree")
dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)
clf_dt=dt.fit(X_train,y_train)
print ("Accuracy: ", clf_dt.score(X_test, y_test))
t1=time()
print ("time elapsed: ", t1-t0)
In [29]:
tt0=time()
print ("Cross-validation results ========")
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [107]:
from sklearn.metrics import classification_report
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (5, 25, 50),
    'clf__min_samples_split': (2, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}
# Use a weighted F1 score because the target has 100 classes
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1_weighted')
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
In [30]:
t2=time()
print ("RandomForest")
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print ("Accuracy: ", clf_rf.score(X_test, y_test))
t3=time()
print ("time elapsed: ", t3-t2)
In [31]:
tt0=time()
print ("Cross-validation results ========")
scores = cross_validation.cross_val_score(rf, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [112]:
pipeline2 = Pipeline([
    ('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25, 50),
    'clf__min_samples_split': (2, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid_search.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
In [32]:
t4=time()
print ("NaiveBayes")
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print ("Accuracy: ", clf_nb.score(X_test, y_test))
t5=time()
print ("time elapsed: ", t5-t4)
In [33]:
tt0=time()
print ("Cross-validation results ========")
scores = cross_validation.cross_val_score(nb, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [34]:
t6=time()
print ("KNN")
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)
print ("Accuracy: ", clf_knn.score(X_test, y_test))
t7=time()
print ("time elapsed: ", t7-t6)
In [35]:
tt0=time()
print ("Cross-validation results ========")
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [36]:
t7=time()
print ("SVM")
svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print ("Accuracy: ", clf_svc.score(X_test, y_test))
t8=time()
print ("time elapsed: ", t8-t7)
In [37]:
tt0=time()
print ("Cross-validation results ========")
scores = cross_validation.cross_val_score(svc, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [121]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best score: %0.3f' % grid.best_score_)
print('Best parameters set:')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions))
In [122]:
pipeline = Pipeline([
    ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
])
parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
In [43]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=None)
bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)
from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))
In [44]:
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1)
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=0)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))